// 文档 https://github.com/hooke007/MPV_lazy/wiki/4_GLSL

// CuNNy veryfast SOFT
// Copyright (c) 2024 funnyplanter

// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3.0 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program.  If not, see <https://www.gnu.org/licenses/>.
/* ------------------------------------------------------------------- */


//!DESC [CuNNy_veryfast_SOFT] -in
//!HOOK LUMA
//!COMPUTE 16 8 8 8
//!BIND LUMA
//!SAVE in
//!WIDTH LUMA.w 2 *
//!HEIGHT LUMA.h
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass "-in": reads the LUMA plane and saves a 4-component texture named
// `in` that is twice as wide as LUMA and equally tall (two texels per LUMA
// pixel). The pass runs as a compute shader: 16x8 output texels per block,
// 8x8 threads per block.
// The WHEN expression is in reverse Polish notation: the pass is enabled only
// when OUTPUT is more than 1.2x LUMA in both width and height.
// NOTE(review): the `//!` lines above are pass metadata read by mpv, not
// ordinary comments; consult the mpv user-shader docs before editing them.

// Use 16-bit float vector/matrix/scalar types when the float16 extension
// macro is defined; otherwise fall back to the 32-bit equivalents.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif
// l0(x, y): red channel of the LUMA texel at pos + (x, y), scaled by LUMA_mul
// and converted to F. The coordinate is clamped to [0, sz], so reads past the
// image edge repeat the edge texel. Relies on `pos` and `sz` being in scope
// where the macro is expanded.
#define l0(x, y) F((LUMA_mul * texelFetch(LUMA_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(1, 1) + ivec2(0, 0), 0)).r)
// Per-workgroup input tile: 10x10 samples, i.e. the 8x8 thread block plus a
// one-texel border on every side (filled and read in hook()).
shared F G[1][10][10];
// Entry point of the "-in" pass. Each invocation handles one LUMA pixel: it
// applies a 3x3 kernel to the single input channel and produces eight output
// values, written as two horizontally adjacent 4-component texels.
void hook() {
	ivec2 xy = ivec2(gl_LocalInvocationID.xy);
	// Pixel handled by this invocation (workgroups are 8x8 invocations).
	ivec2 pos = ivec2(gl_WorkGroupID.xy) * ivec2(8, 8) + xy;
	// Output x is doubled because two texels are written per input pixel.
	ivec2 opos = pos * ivec2(2, 1);
	// Largest valid LUMA texel coordinate; l0() clamps against it.
	ivec2 sz = ivec2(LUMA_size) - ivec2(1);
	// Cooperative fill of the 10x10 tile: each invocation loads the entries at
	// tile offsets +0 and +8 on each axis, skipping those beyond index 9.
	// The -1 shifts the tile so it starts one texel up/left of the workgroup.
	for (int y = 0; y < 10; y += 8) {
		int ay = xy.y + y;
		if (ay >= 10) break;
		for (int x = 0; x < 10; x += 8) {
			int ax = xy.x + x;
			if (ax >= 10) break;
			G[0][ay][ax] = l0(x - 1, y - 1);
		}
	}
	// Wait until every invocation has written its tile entries before any
	// invocation reads its neighbours.
	barrier();
	// s0_R_C is the tap at row R, column C of the 3x3 window around this
	// pixel; s0_1_1 is the pixel itself.
	F s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2;
	// r0 and r1 each accumulate four of the eight output values.
	V4 r0, r1;
	r0 = V4(0.0); r1 = V4(0.0);
	s0_0_0 = G[0][xy.y+0][xy.x+0]; s0_0_1 = G[0][xy.y+0][xy.x+1];
	s0_0_2 = G[0][xy.y+0][xy.x+2]; s0_1_0 = G[0][xy.y+1][xy.x+0];
	s0_1_1 = G[0][xy.y+1][xy.x+1]; s0_1_2 = G[0][xy.y+1][xy.x+2];
	s0_2_0 = G[0][xy.y+2][xy.x+0]; s0_2_1 = G[0][xy.y+2][xy.x+1];
	s0_2_2 = G[0][xy.y+2][xy.x+2];
	// Weighted sum: one 4-vector of weights per tap and per accumulator,
	// scaled by the scalar tap value.
	r0 += V4(-4.261e-02, -3.667e-02, -1.154e-02, -1.987e-01) * s0_0_0;
	r1 += V4(1.432e-02, -1.051e-02, 2.980e-03, 6.233e-02) * s0_0_0;
	r0 += V4(1.150e+00, 3.870e-02, -1.912e-02, -1.018e-01) * s0_0_1;
	r1 += V4(2.738e-02, 7.880e-02, -2.794e-02, 2.159e-01) * s0_0_1;
	r0 += V4(1.466e-02, 1.814e-02, 3.406e-02, 3.213e-01) * s0_0_2;
	r1 += V4(-4.349e-02, -5.303e-02, 2.717e-02, -8.656e-02) * s0_0_2;
	r0 += V4(-6.192e-03, 3.412e-01, 1.076e+00, 8.554e-01) * s0_1_0;
	r1 += V4(-3.558e-02, 5.132e-02, -1.138e-02, 3.005e-01) * s0_1_0;
	r0 += V4(-1.105e+00, 2.220e-01, -9.474e-01, -9.155e-01) * s0_1_1;
	r1 += V4(-1.059e+00, -1.333e-01, -1.073e+00, -1.152e+00) * s0_1_1;
	r0 += V4(-1.126e-02, 1.069e-02, -1.051e-01, 7.108e-03) * s0_1_2;
	r1 += V4(1.090e+00, 2.574e-01, -2.334e-02, 2.402e-01) * s0_1_2;
	r0 += V4(4.406e-02, 8.321e-02, -4.105e-02, 2.105e-01) * s0_2_0;
	r1 += V4(1.507e-02, -1.744e-02, 3.503e-05, -1.016e-01) * s0_2_0;
	r0 += V4(-4.078e-02, -7.062e-01, -6.218e-02, 4.684e-02) * s0_2_1;
	r1 += V4(-1.002e-02, -1.030e-01, 1.098e+00, 2.858e-01) * s0_2_1;
	r0 += V4(-5.676e-03, 1.173e-02, 7.469e-02, -2.327e-01) * s0_2_2;
	r1 += V4(-5.805e-03, -4.049e-02, 4.043e-03, 4.426e-02) * s0_2_2;
	// Add the constant offset for r0, clamp to [0, 1] and store the first
	// output texel.
	r0 += V4(-1.137e-03, -3.262e-04, -2.050e-03, -2.276e-03);
	r0 = clamp(r0, V4(0.0), V4(1.0));
	imageStore(out_image, opos + ivec2(0, 0), vec4(r0));
	// Same for r1, stored in the texel immediately to the right.
	r1 += V4(-5.205e-04, 1.480e-02, 2.195e-05, -1.857e-03);
	r1 = clamp(r1, V4(0.0), V4(1.0));
	imageStore(out_image, opos + ivec2(1, 0), vec4(r1));
}

//!DESC [CuNNy_veryfast_SOFT] -conv1
//!HOOK LUMA
//!COMPUTE 16 8 8 8
//!BIND in
//!BIND LUMA
//!SAVE conv1
//!WIDTH LUMA.w 2 *
//!HEIGHT LUMA.h
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass "-conv1": reads the texture `in` and saves a 4-component texture named
// `conv1` with the same layout (twice as wide as LUMA, two texels per LUMA
// pixel). Compute blocks cover 16x8 output texels with 8x8 threads. LUMA is
// bound as well; the hook only uses its size.
// The WHEN expression (reverse Polish notation) enables the pass only when
// OUTPUT is more than 1.2x LUMA in both width and height.
// NOTE(review): the `//!` lines above are pass metadata read by mpv, not
// ordinary comments; consult the mpv user-shader docs before editing them.

// Use 16-bit float vector/matrix/scalar types when the float16 extension
// macro is defined; otherwise fall back to the 32-bit equivalents.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif
// l0(x, y) / l1(x, y): first and second `in` texel belonging to the LUMA-grid
// pixel at pos + (x, y), scaled by in_mul. The pixel coordinate is clamped to
// [0, sz] before being mapped to texel columns 2*x and 2*x + 1. Both macros
// rely on `pos` and `sz` being in scope where they are expanded.
#define l0(x, y) V4((in_mul * texelFetch(in_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(0, 0), 0)))
#define l1(x, y) V4((in_mul * texelFetch(in_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(1, 0), 0)))
// Per-workgroup input tiles, one per input texel (l0 -> G[0], l1 -> G[1]):
// 10x10 entries each, i.e. the 8x8 thread block plus a one-texel border.
shared V4 G[2][10][10];
// Entry point of the "-conv1" pass. Each invocation handles one LUMA-grid
// pixel: it applies a 3x3 kernel to the eight input values (two V4 texels)
// and produces eight output values, written as two horizontally adjacent
// 4-component texels. No separate constant offset is added in this pass.
void hook() {
	ivec2 xy = ivec2(gl_LocalInvocationID.xy);
	// Pixel handled by this invocation (workgroups are 8x8 invocations).
	ivec2 pos = ivec2(gl_WorkGroupID.xy) * ivec2(8, 8) + xy;
	// Output x is doubled because two texels are written per pixel.
	ivec2 opos = pos * ivec2(2, 1);
	// Largest valid coordinate on the LUMA grid. l0()/l1() clamp against it
	// before scaling x by 2, so it also bounds reads of the wider input.
	ivec2 sz = ivec2(LUMA_size) - ivec2(1);
	// Cooperative fill of both 10x10 tiles: each invocation loads the entries
	// at tile offsets +0 and +8 on each axis, skipping those beyond index 9.
	// The -1 shifts the tile so it starts one texel up/left of the workgroup.
	for (int y = 0; y < 10; y += 8) {
		int ay = xy.y + y;
		if (ay >= 10) break;
		for (int x = 0; x < 10; x += 8) {
			int ax = xy.x + x;
			if (ax >= 10) break;
			G[0][ay][ax] = l0(x - 1, y - 1);
			G[1][ay][ax] = l1(x - 1, y - 1);
		}
	}
	// Wait until every invocation has written its tile entries before any
	// invocation reads its neighbours.
	barrier();
	// sN_R_C is the tap from tile N at row R, column C of the 3x3 window
	// around this pixel; sN_1_1 is the pixel itself.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	// r0 and r1 each accumulate four of the eight output values.
	V4 r0, r1;
	r0 = V4(0.0); r1 = V4(0.0);
	s0_0_0 = G[0][xy.y+0][xy.x+0]; s0_0_1 = G[0][xy.y+0][xy.x+1];
	s0_0_2 = G[0][xy.y+0][xy.x+2]; s0_1_0 = G[0][xy.y+1][xy.x+0];
	s0_1_1 = G[0][xy.y+1][xy.x+1]; s0_1_2 = G[0][xy.y+1][xy.x+2];
	s0_2_0 = G[0][xy.y+2][xy.x+0]; s0_2_1 = G[0][xy.y+2][xy.x+1];
	s0_2_2 = G[0][xy.y+2][xy.x+2]; s1_0_0 = G[1][xy.y+0][xy.x+0];
	s1_0_1 = G[1][xy.y+0][xy.x+1]; s1_0_2 = G[1][xy.y+0][xy.x+2];
	s1_1_0 = G[1][xy.y+1][xy.x+0]; s1_1_1 = G[1][xy.y+1][xy.x+1];
	s1_1_2 = G[1][xy.y+1][xy.x+2]; s1_2_0 = G[1][xy.y+2][xy.x+0];
	s1_2_1 = G[1][xy.y+2][xy.x+1]; s1_2_2 = G[1][xy.y+2][xy.x+2];
	// Weighted sum: one 4x4 weight matrix per tap and per accumulator,
	// multiplied by the 4-component tap (matrix * vector). The 16 constructor
	// arguments fill the matrix column by column, per GLSL convention.
	r0 += M4(-2.317e-02, 5.272e-02, -3.439e-02, -1.015e-01, 2.302e-01, -3.604e-01, 3.614e-02, 1.551e-01, -8.387e-03, 1.322e-01, 1.868e-02, -1.441e-01, -5.112e-02, -2.141e-02, -5.701e-02, 6.522e-02) * s0_0_0;
	r1 += M4(9.446e-03, -1.047e-02, 2.699e-02, 1.361e-01, 3.270e-01, 3.117e-03, 4.701e-02, 1.144e-01, -2.290e-01, 3.265e-03, -5.005e-03, -7.935e-02, 2.060e-01, -5.508e-02, 2.585e-02, 1.577e-01) * s0_0_0;
	r0 += M4(1.013e-01, -7.155e-02, 8.903e-03, 9.070e-03, 1.000e+00, -6.657e-02, 6.601e-03, -3.300e-01, -2.582e-01, -1.388e-01, 2.061e-01, 5.890e-02, 1.291e-01, 1.801e-01, -3.144e-01, -2.903e-02) * s0_0_1;
	r1 += M4(6.140e-02, -2.030e-03, 3.199e-02, -7.252e-02, -1.210e-01, -1.077e-01, -1.639e-01, -3.029e-01, 2.073e-01, 5.413e-02, -1.252e-01, -4.111e-01, 2.313e-01, 8.847e-03, 1.771e-01, -1.342e-01) * s0_0_1;
	r0 += M4(-3.009e-02, -6.471e-02, 9.174e-02, -3.098e-02, 4.093e-01, -2.255e-01, 7.047e-02, 2.416e-02, 9.091e-03, -1.937e-01, 1.631e-01, 6.546e-03, -1.712e-01, 1.555e-01, -2.085e-01, -4.236e-02) * s0_0_2;
	r1 += M4(-7.917e-02, -1.012e-02, 1.778e-02, -6.940e-02, -1.622e-01, 1.324e-01, 2.686e-01, -2.105e-01, 3.848e-02, -6.404e-02, -3.179e-02, -6.604e-02, 1.613e-01, 1.367e-02, -4.429e-03, 1.431e-01) * s0_0_2;
	r0 += M4(-7.480e-01, 1.948e-01, -8.205e-02, -1.245e-01, -1.374e-01, -1.338e-01, -1.935e-01, 2.278e-02, -5.237e-02, 8.106e-02, 1.817e-01, -1.952e-01, 6.013e-04, 1.929e-01, -1.929e-01, 6.607e-02) * s0_1_0;
	r1 += M4(-2.201e-01, -4.525e-02, 1.480e-01, 3.586e-01, 1.890e-02, 5.608e-02, 6.244e-02, 1.859e-01, -5.123e-01, -1.687e-02, -1.088e-02, -4.600e-01, 4.733e-01, 2.169e-02, -6.417e-02, 4.324e-01) * s0_1_0;
	r0 += M4(-1.000e+00, 1.159e-01, 7.523e-02, 1.068e-01, 5.582e-01, -4.866e-01, -2.840e-02, 3.717e-02, -7.529e-01, -4.160e-01, 2.048e-01, 7.513e-01, -3.195e-01, 6.345e-01, -1.552e-01, -1.220e-01) * s0_1_1;
	r1 += M4(7.022e-03, 6.587e-02, 3.896e-01, 3.266e-01, -4.821e-01, 6.930e-03, 5.641e-03, -1.459e-01, -2.472e-01, 2.856e-01, 1.287e-01, 1.043e-01, 4.405e-01, 1.999e-01, -5.410e-02, 3.628e-01) * s0_1_1;
	r0 += M4(-8.321e-02, -2.656e-01, 9.260e-02, 5.405e-02, -3.715e-01, -3.898e-01, 1.939e-01, -2.749e-02, 2.579e-01, -2.481e-01, 3.015e-01, 3.142e-01, -2.288e-01, -1.697e-01, 1.585e-01, -1.532e-01) * s0_1_2;
	r1 += M4(1.289e-01, -5.706e-02, 1.033e-01, 2.068e-01, 4.812e-01, 3.518e-02, -1.272e-01, 4.975e-02, 5.598e-02, -5.093e-02, 1.023e-02, -1.706e-01, 4.388e-01, 7.300e-03, -7.831e-02, 4.584e-01) * s0_1_2;
	r0 += M4(-7.988e-02, 6.628e-01, 9.325e-02, 4.713e-02, -9.423e-02, -5.755e-02, -1.017e-01, 4.653e-02, 2.810e-01, -2.791e-01, 2.305e-01, 5.082e-02, -2.454e-01, 3.903e-01, -9.890e-02, -1.060e-01) * s0_2_0;
	r1 += M4(-3.171e-01, 2.223e-01, 1.016e-01, -2.997e-01, -1.256e-01, -2.312e-02, -1.605e-02, -1.470e-01, -6.054e-02, 2.838e-02, 4.108e-02, -1.450e-01, -5.528e-02, 5.076e-03, -5.979e-02, 7.576e-02) * s0_2_0;
	r0 += M4(4.734e-01, -7.768e-01, -4.875e-01, 1.294e-01, 1.057e-01, -1.828e-02, 1.382e-01, -9.510e-02, 2.541e-01, -7.897e-02, 2.929e-02, 2.042e-01, -3.576e-01, 1.715e-01, -2.688e-02, 8.829e-03) * s0_2_1;
	r1 += M4(-1.000e+00, -4.001e-01, -5.633e-02, -5.902e-01, -4.371e-02, -1.540e-02, 7.458e-02, 3.601e-03, -4.072e-01, 4.472e-02, 1.508e-01, -3.623e-01, 1.880e-01, 3.842e-02, 2.732e-03, 1.093e-01) * s0_2_1;
	r0 += M4(8.522e-02, 4.538e-02, -2.472e-02, -5.333e-02, 1.843e-01, 4.129e-02, -1.873e-01, 8.822e-02, 2.998e-01, 2.466e-01, -1.492e-01, 6.354e-02, 1.074e-01, -5.736e-02, -6.774e-02, 1.315e-02) * s0_2_2;
	r1 += M4(-3.185e-02, -5.508e-03, -7.594e-02, 1.717e-02, 3.185e-02, 2.248e-02, -6.870e-03, -9.904e-02, -2.378e-01, -5.526e-02, 1.330e-02, -1.314e-01, -2.593e-02, 2.126e-02, 2.943e-02, 7.594e-03) * s0_2_2;
	r0 += M4(-1.681e-02, 3.207e-02, 1.561e-02, -3.717e-03, 4.185e-01, -3.211e-01, -1.347e-01, 3.626e-01, -4.593e-02, -1.340e-02, 3.529e-02, -1.208e-01, 2.548e-02, -2.094e-02, 3.183e-02, 1.154e-01) * s1_0_0;
	r1 += M4(-1.737e-01, -4.892e-02, 1.412e-02, 8.224e-02, 1.948e-01, 1.117e-01, 1.811e-01, -2.939e-01, -6.924e-02, 1.006e-02, -2.183e-02, -5.577e-02, 5.236e-02, -4.419e-03, -7.115e-02, -1.303e-01) * s1_0_0;
	r0 += M4(1.610e-01, -2.277e-02, -1.859e-01, -2.116e-02, -1.103e-01, -7.833e-02, 2.868e-01, -1.191e-01, -5.413e-01, -1.830e-01, 1.193e-01, -2.046e-01, -2.798e-01, 1.855e-01, 5.787e-02, 2.597e-02) * s1_0_1;
	r1 += M4(1.322e-01, 1.811e-02, 2.864e-01, -1.179e-01, -2.212e-01, -1.489e-01, -2.653e-01, -7.070e-02, 4.108e-01, 5.211e-02, -4.170e-01, 2.821e-01, -4.759e-01, -7.631e-02, 4.210e-02, 5.473e-02) * s1_0_1;
	r0 += M4(1.004e-01, 7.086e-03, -2.172e-02, 4.274e-02, -1.213e-01, 1.241e-01, -1.095e-01, -3.957e-02, 2.535e-01, -5.180e-02, 8.519e-02, -1.098e-01, -1.846e-01, 9.316e-03, 1.699e-02, 9.931e-02) * s1_0_2;
	r1 += M4(-1.504e-01, -6.006e-02, 3.918e-02, -1.155e-01, -3.133e-01, 1.294e-02, -2.019e-02, -6.971e-02, 3.561e-02, -4.294e-02, 2.705e-01, -9.707e-02, 2.098e-01, 1.454e-01, 1.316e-01, 2.570e-01) * s1_0_2;
	r0 += M4(-1.000e+00, 1.694e-01, 4.577e-01, -3.535e-01, 5.836e-01, 6.854e-03, 2.045e-02, 6.627e-03, -8.173e-02, -7.347e-02, 1.171e-01, -1.382e-01, 2.366e-01, -2.064e-02, -2.124e-01, 3.978e-01) * s1_1_0;
	r1 += M4(-1.000e+00, 3.090e-02, -3.174e-01, -7.609e-01, -3.841e-01, -1.475e-01, -1.543e-01, -3.232e-01, -9.782e-02, -1.138e-02, 2.073e-02, -2.145e-01, 4.195e-01, 6.958e-02, -3.451e-02, 3.274e-01) * s1_1_0;
	r0 += M4(-4.615e-01, -2.588e-01, 1.933e-01, 1.677e-01, -1.987e-01, -6.211e-01, -4.863e-01, -3.242e-01, -9.387e-01, 4.447e-01, 2.171e-01, 4.502e-01, 3.647e-01, 1.860e-01, -1.683e-01, -2.009e-01) * s1_1_1;
	r1 += M4(2.377e-01, 1.304e-01, 3.446e-01, 1.349e-01, 4.559e-01, 4.012e-02, 5.323e-01, -1.730e-01, 4.924e-02, 7.569e-01, 5.300e-03, 8.070e-02, -2.962e-02, -2.235e-01, -3.263e-01, -2.233e-01) * s1_1_1;
	r0 += M4(2.471e-02, -2.320e-01, 7.255e-02, -4.576e-02, -4.435e-02, 4.219e-01, -9.931e-02, 1.938e-01, 2.428e-01, -1.633e-02, -2.124e-01, -1.719e-02, -1.479e-01, 5.801e-01, -6.412e-02, -5.783e-02) * s1_1_2;
	r1 += M4(4.273e-02, -2.143e-02, 1.765e-02, 4.531e-02, -1.894e-01, -5.881e-02, -1.324e-01, -5.145e-02, -4.747e-01, 2.684e-03, 1.724e-01, 4.132e-02, 1.916e-01, -1.311e-02, -7.760e-02, -1.156e-01) * s1_1_2;
	r0 += M4(4.724e-01, 1.370e-01, 2.607e-01, -2.919e-03, -9.036e-02, 6.226e-02, -1.531e-01, -1.695e-01, 1.693e-01, -4.528e-02, -8.536e-02, 9.589e-03, -2.840e-01, -1.684e-01, -1.363e-01, 4.022e-02) * s1_2_0;
	r1 += M4(-2.407e-01, 1.032e-01, -5.434e-02, -1.949e-01, 7.444e-01, 4.426e-02, -9.009e-02, 6.895e-01, 4.696e-02, -2.167e-02, -2.013e-02, 1.606e-02, 2.490e-01, -5.826e-02, -5.901e-03, 6.904e-02) * s1_2_0;
	r0 += M4(8.377e-02, -3.638e-01, -9.553e-01, -2.787e-02, -5.925e-01, 2.580e-01, -1.840e-01, 5.586e-02, 2.524e-02, -2.618e-02, -6.612e-01, 3.338e-02, -3.398e-01, -2.901e-01, -1.492e-01, -1.301e-01) * s1_2_1;
	r1 += M4(2.156e-01, -9.540e-02, 1.815e-01, -8.468e-02, -1.616e-01, -4.252e-02, 1.093e-02, 2.916e-01, -1.525e-01, -6.038e-02, 3.698e-02, -9.251e-02, 4.132e-01, 6.341e-02, -6.102e-02, 4.000e-01) * s1_2_1;
	r0 += M4(-1.914e-02, 5.435e-02, -1.700e-01, -1.910e-02, 1.745e-02, 2.084e-01, 5.780e-01, -1.332e-02, -2.011e-02, 1.116e-01, -2.123e-01, -5.005e-02, 1.182e-01, -1.768e-01, 1.549e-01, 9.116e-02) * s1_2_2;
	r1 += M4(1.022e-01, -1.014e-02, -4.234e-02, 8.255e-02, -1.104e-01, 1.753e-01, -6.934e-02, -8.916e-02, 7.377e-03, 3.992e-03, -7.477e-03, -2.571e-02, -8.377e-02, -3.220e-04, 1.057e-01, -2.563e-02) * s1_2_2;
	// Clamp to [0, 1] and store: r0 in the first output texel, r1 in the
	// texel immediately to the right.
	r0 = clamp(r0, V4(0.0), V4(1.0));
	imageStore(out_image, opos + ivec2(0, 0), vec4(r0));
	r1 = clamp(r1, V4(0.0), V4(1.0));
	imageStore(out_image, opos + ivec2(1, 0), vec4(r1));
}

//!DESC [CuNNy_veryfast_SOFT] -conv2
//!HOOK LUMA
//!COMPUTE 8 8 8 8
//!BIND conv1
//!BIND LUMA
//!SAVE conv2
//!WIDTH LUMA.w
//!HEIGHT LUMA.h
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass "-conv2": reads the texture `conv1` (two texels per LUMA pixel) and
// saves a 4-component texture named `conv2` at LUMA resolution (one texel
// per LUMA pixel). Compute blocks cover 8x8 output texels with 8x8 threads.
// LUMA is bound as well; the hook only uses its size.
// The WHEN expression (reverse Polish notation) enables the pass only when
// OUTPUT is more than 1.2x LUMA in both width and height.
// NOTE(review): the `//!` lines above are pass metadata read by mpv, not
// ordinary comments; consult the mpv user-shader docs before editing them.

// Use 16-bit float vector/matrix/scalar types when the float16 extension
// macro is defined; otherwise fall back to the 32-bit equivalents.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif
// l0(x, y) / l1(x, y): first and second `conv1` texel belonging to the
// LUMA-grid pixel at pos + (x, y), scaled by conv1_mul. The pixel coordinate
// is clamped to [0, sz] before being mapped to texel columns 2*x and 2*x + 1.
// Both macros rely on `pos` and `sz` being in scope where they are expanded.
#define l0(x, y) V4((conv1_mul * texelFetch(conv1_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(0, 0), 0)))
#define l1(x, y) V4((conv1_mul * texelFetch(conv1_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(1, 0), 0)))
// Per-workgroup input tiles, one per input texel (l0 -> G[0], l1 -> G[1]):
// 10x10 entries each, i.e. the 8x8 thread block plus a one-texel border.
shared V4 G[2][10][10];
// Entry point of the "-conv2" pass. Each invocation handles one LUMA-grid
// pixel: it applies a 3x3 kernel to the eight input values (two V4 texels)
// and produces four output values, written as a single 4-component texel.
// No separate constant offset is added in this pass.
void hook() {
	ivec2 xy = ivec2(gl_LocalInvocationID.xy);
	// Pixel handled by this invocation (workgroups are 8x8 invocations).
	ivec2 pos = ivec2(gl_WorkGroupID.xy) * ivec2(8, 8) + xy;
	// One output texel per pixel, so the output position equals pos.
	ivec2 opos = pos * ivec2(1, 1);
	// Largest valid coordinate on the LUMA grid. l0()/l1() clamp against it
	// before scaling x by 2, so it also bounds reads of the wider input.
	ivec2 sz = ivec2(LUMA_size) - ivec2(1);
	// Cooperative fill of both 10x10 tiles: each invocation loads the entries
	// at tile offsets +0 and +8 on each axis, skipping those beyond index 9.
	// The -1 shifts the tile so it starts one texel up/left of the workgroup.
	for (int y = 0; y < 10; y += 8) {
		int ay = xy.y + y;
		if (ay >= 10) break;
		for (int x = 0; x < 10; x += 8) {
			int ax = xy.x + x;
			if (ax >= 10) break;
			G[0][ay][ax] = l0(x - 1, y - 1);
			G[1][ay][ax] = l1(x - 1, y - 1);
		}
	}
	// Wait until every invocation has written its tile entries before any
	// invocation reads its neighbours.
	barrier();
	// sN_R_C is the tap from tile N at row R, column C of the 3x3 window
	// around this pixel; sN_1_1 is the pixel itself.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	// Single accumulator: this pass outputs four values per pixel.
	V4 r0;
	r0 = V4(0.0);
	s0_0_0 = G[0][xy.y+0][xy.x+0]; s0_0_1 = G[0][xy.y+0][xy.x+1];
	s0_0_2 = G[0][xy.y+0][xy.x+2]; s0_1_0 = G[0][xy.y+1][xy.x+0];
	s0_1_1 = G[0][xy.y+1][xy.x+1]; s0_1_2 = G[0][xy.y+1][xy.x+2];
	s0_2_0 = G[0][xy.y+2][xy.x+0]; s0_2_1 = G[0][xy.y+2][xy.x+1];
	s0_2_2 = G[0][xy.y+2][xy.x+2]; s1_0_0 = G[1][xy.y+0][xy.x+0];
	s1_0_1 = G[1][xy.y+0][xy.x+1]; s1_0_2 = G[1][xy.y+0][xy.x+2];
	s1_1_0 = G[1][xy.y+1][xy.x+0]; s1_1_1 = G[1][xy.y+1][xy.x+1];
	s1_1_2 = G[1][xy.y+1][xy.x+2]; s1_2_0 = G[1][xy.y+2][xy.x+0];
	s1_2_1 = G[1][xy.y+2][xy.x+1]; s1_2_2 = G[1][xy.y+2][xy.x+2];
	// Weighted sum: one 4x4 weight matrix per tap, multiplied by the
	// 4-component tap (matrix * vector). The 16 constructor arguments fill
	// the matrix column by column, per GLSL convention.
	r0 += M4(2.165e-02, 3.026e-02, 4.051e-02, 3.915e-02, -8.661e-02, -4.209e-02, -9.197e-02, -8.715e-02, -1.757e-02, 3.272e-02, 2.691e-02, 3.720e-02, -3.657e-02, -3.421e-02, 9.457e-02, -9.498e-02) * s0_0_0;
	r0 += M4(-3.844e-03, 1.111e-01, -2.155e-01, 8.561e-02, 1.378e-01, -1.463e-02, -2.997e-01, -1.721e-01, -3.336e-02, 6.568e-03, -1.558e-01, -1.757e-01, 1.859e-01, -1.530e-01, 2.372e-01, -5.114e-02) * s0_0_1;
	r0 += M4(-1.593e-01, 1.325e-02, -9.455e-02, 2.536e-02, -1.508e-01, -1.444e-02, 5.664e-02, 6.149e-02, -5.230e-02, 7.373e-03, 9.302e-02, 1.058e-01, -2.542e-02, -5.521e-02, 8.569e-02, -8.126e-02) * s0_0_2;
	r0 += M4(7.059e-03, -2.256e-01, -3.135e-01, -6.569e-02, -4.626e-02, -6.717e-03, -2.683e-02, 3.845e-02, 5.796e-02, -3.883e-01, -2.855e-02, -7.155e-02, -5.518e-02, 1.636e-01, 1.162e-01, 4.832e-02) * s0_1_0;
	r0 += M4(-3.423e-01, -3.567e-01, 1.446e-02, -4.825e-01, -3.835e-01, -3.271e-01, -1.225e-01, -3.975e-01, -1.139e-01, -6.856e-02, -3.757e-02, -3.596e-01, 1.564e-01, 1.209e-02, -2.389e-01, 9.043e-02) * s0_1_1;
	r0 += M4(-7.499e-02, -1.461e-01, -1.823e-02, -2.821e-01, -1.620e-02, 1.028e-01, 6.275e-03, 7.592e-02, 1.056e-01, 9.988e-02, 2.519e-02, 1.703e-01, -1.373e-01, 2.056e-02, -1.912e-02, 1.412e-02) * s0_1_2;
	r0 += M4(-1.091e-02, -1.138e-01, -3.009e-02, -1.811e-01, -4.826e-02, -1.227e-01, 2.871e-02, -5.174e-02, -1.070e-02, 1.071e-01, 3.567e-02, 4.503e-02, -6.667e-02, -2.303e-01, 3.178e-02, -8.346e-02) * s0_2_0;
	r0 += M4(-1.034e-01, -4.693e-02, -3.141e-04, -5.685e-02, -1.026e-01, -7.886e-02, 4.021e-03, -8.618e-02, 4.561e-02, 2.164e-02, -3.332e-02, 1.561e-02, 3.133e-01, 4.816e-02, 8.110e-04, 1.741e-01) * s0_2_1;
	r0 += M4(-7.156e-02, -3.047e-02, -1.382e-02, -6.068e-02, -3.332e-02, -1.972e-02, 3.408e-05, -5.339e-02, 5.886e-02, 7.158e-02, -6.510e-03, 6.955e-02, 6.617e-02, 4.597e-02, 1.283e-02, 2.872e-02) * s0_2_2;
	r0 += M4(3.012e-02, -2.653e-02, 1.491e-02, 2.057e-02, 1.995e-02, -2.607e-02, 2.725e-01, 5.896e-02, 3.895e-03, 1.388e-02, 1.098e-02, 1.906e-02, 1.514e-02, -4.564e-02, -7.828e-02, -1.521e-02) * s1_0_0;
	r0 += M4(-5.846e-03, -5.346e-03, -1.548e-01, -2.817e-02, -1.377e-01, -8.246e-02, 1.596e-01, -1.051e-01, 2.435e-02, -2.421e-02, -6.735e-02, -5.897e-02, -3.703e-02, 1.571e-02, -7.368e-02, 3.135e-02) * s1_0_1;
	r0 += M4(-9.886e-03, -5.964e-03, -1.232e-03, -2.480e-02, 1.056e-01, 3.212e-02, -2.059e-01, -2.742e-02, -4.285e-02, -5.957e-03, -2.441e-02, -2.021e-02, 1.097e-03, -2.363e-02, 7.662e-03, -1.922e-02) * s1_0_2;
	r0 += M4(2.475e-02, 1.587e-01, 4.623e-02, 2.221e-01, 9.528e-02, 6.513e-02, -1.196e-01, -1.614e-02, -1.004e-01, 3.769e-01, 2.771e-01, 2.510e-01, 1.271e-02, -2.648e-01, -4.736e-02, -8.565e-02) * s1_1_0;
	r0 += M4(-4.147e-02, -9.884e-02, -4.189e-01, -3.886e-01, 7.715e-01, 4.817e-01, 3.018e-01, 8.706e-01, -2.961e-02, -5.356e-02, 3.162e-01, 7.178e-02, 7.615e-02, -2.710e-01, 3.433e-01, -3.467e-01) * s1_1_1;
	r0 += M4(-5.225e-02, -1.094e-03, -4.061e-02, 2.662e-02, 1.853e-01, -1.053e-01, -9.647e-02, -1.398e-01, 5.732e-02, -5.526e-03, 3.870e-02, 1.720e-02, -6.596e-02, -3.064e-02, -2.771e-02, -7.296e-02) * s1_1_2;
	r0 += M4(4.133e-02, 9.448e-03, -1.685e-02, 1.614e-02, 7.545e-02, 8.521e-02, -7.719e-02, 2.543e-02, 6.839e-02, 1.392e-01, -7.152e-02, 7.443e-02, 9.340e-03, 1.538e-02, -3.676e-02, -4.460e-02) * s1_2_0;
	r0 += M4(-9.161e-02, -3.467e-01, 5.091e-02, -2.385e-01, -3.661e-02, 1.258e-01, 4.126e-04, 6.453e-02, -3.546e-02, 2.008e-01, 5.913e-03, 1.685e-01, 1.245e-01, 3.584e-01, -5.790e-02, 3.434e-01) * s1_2_1;
	r0 += M4(-3.213e-02, -2.484e-02, -7.514e-03, -4.339e-02, -3.755e-02, -8.872e-02, 4.703e-03, -8.078e-02, 4.865e-03, 9.485e-03, 6.023e-03, 1.481e-02, -7.209e-03, -5.040e-02, -7.514e-03, 5.677e-02) * s1_2_2;
	// Clamp to [0, 1] and store the single output texel.
	r0 = clamp(r0, V4(0.0), V4(1.0));
	imageStore(out_image, opos + ivec2(0, 0), vec4(r0));
}

//!DESC [CuNNy_veryfast_SOFT] -out-shuffle
//!HOOK LUMA
//!COMPUTE 16 16 8 8
//!BIND conv2
//!BIND LUMA
//!WIDTH LUMA.w 2 *
//!HEIGHT LUMA.h 2 *
//!COMPONENTS 1
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass "-out-shuffle": reads `conv2` and LUMA and produces a 1-component
// output that is twice as wide and twice as tall as LUMA. Compute blocks
// cover 16x16 output texels with 8x8 threads (a 2x2 output block per thread).
// There is no SAVE directive; per the mpv user-shader docs the result then
// replaces the hooked texture (LUMA) — confirm there before relying on it.
// The WHEN expression (reverse Polish notation) enables the pass only when
// OUTPUT is more than 1.2x LUMA in both width and height.
// NOTE(review): the `//!` lines above are pass metadata read by mpv, not
// ordinary comments; consult the mpv user-shader docs before editing them.

// Use 16-bit float vector/matrix/scalar types when the float16 extension
// macro is defined; otherwise fall back to the 32-bit equivalents.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif
// l0(x, y): `conv2` texel at pos + (x, y), scaled by conv2_mul. The
// coordinate is clamped to [0, sz], so reads past the image edge repeat the
// edge texel. Relies on `pos` and `sz` being in scope where it is expanded.
#define l0(x, y) V4((conv2_mul * texelFetch(conv2_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(1, 1) + ivec2(0, 0), 0)))
// Per-workgroup input tile: 10x10 entries, i.e. the 8x8 thread block plus a
// one-texel border on every side (filled and read in hook()).
shared V4 G[1][10][10];
// Entry point of the "-out-shuffle" pass. Each invocation handles one
// LUMA-grid pixel: it applies a 3x3 kernel to the four input values and
// spreads the four results over a 2x2 block of output texels, adding each
// result to a LUMA sample taken at that output texel's centre.
void hook() {
	ivec2 xy = ivec2(gl_LocalInvocationID.xy);
	// Pixel handled by this invocation (workgroups are 8x8 invocations).
	ivec2 pos = ivec2(gl_WorkGroupID.xy) * ivec2(8, 8) + xy;
	// Top-left corner of this pixel's 2x2 block in the doubled output.
	ivec2 opos = pos * ivec2(2, 2);
	// Largest valid coordinate on the LUMA grid; l0() clamps against it.
	ivec2 sz = ivec2(LUMA_size) - ivec2(1);
	// Cooperative fill of the 10x10 tile: each invocation loads the entries at
	// tile offsets +0 and +8 on each axis, skipping those beyond index 9.
	// The -1 shifts the tile so it starts one texel up/left of the workgroup.
	for (int y = 0; y < 10; y += 8) {
		int ay = xy.y + y;
		if (ay >= 10) break;
		for (int x = 0; x < 10; x += 8) {
			int ax = xy.x + x;
			if (ax >= 10) break;
			G[0][ay][ax] = l0(x - 1, y - 1);
		}
	}
	// Wait until every invocation has written its tile entries before any
	// invocation reads its neighbours.
	barrier();
	// s0_R_C is the tap at row R, column C of the 3x3 window around this
	// pixel; s0_1_1 is the pixel itself.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2;
	// One accumulator: its four components become the four output texels.
	V4 r0;
	r0 = V4(0.0);
	s0_0_0 = G[0][xy.y+0][xy.x+0]; s0_0_1 = G[0][xy.y+0][xy.x+1];
	s0_0_2 = G[0][xy.y+0][xy.x+2]; s0_1_0 = G[0][xy.y+1][xy.x+0];
	s0_1_1 = G[0][xy.y+1][xy.x+1]; s0_1_2 = G[0][xy.y+1][xy.x+2];
	s0_2_0 = G[0][xy.y+2][xy.x+0]; s0_2_1 = G[0][xy.y+2][xy.x+1];
	s0_2_2 = G[0][xy.y+2][xy.x+2];
	// Weighted sum: one 4x4 weight matrix per tap, multiplied by the
	// 4-component tap (matrix * vector). The 16 constructor arguments fill
	// the matrix column by column, per GLSL convention.
	r0 += M4(1.255e-01, 5.798e-02, -1.163e-02, -8.770e-03, 3.371e-03, 2.788e-03, 1.468e-02, 4.457e-03, 2.673e-04, 8.711e-04, 3.817e-04, -1.424e-07, 8.862e-02, -1.069e-02, -1.059e-02, -1.579e-02) * s0_0_0;
	r0 += M4(1.861e-01, 1.508e-01, 1.333e-03, -4.843e-03, 4.479e-02, 2.555e-02, 1.787e-01, 3.182e-02, -1.453e-03, 6.543e-03, -4.353e-04, -8.008e-07, -1.167e-01, 1.298e-01, -4.762e-02, 2.935e-03) * s0_0_1;
	r0 += M4(-6.974e-03, 4.382e-02, -7.400e-04, -3.396e-03, -2.124e-01, -3.001e-01, 1.984e-02, 2.780e-01, 1.173e-03, -7.187e-03, -1.022e-03, -6.778e-04, -1.048e-02, 7.156e-02, -2.708e-03, -3.875e-02) * s0_0_2;
	r0 += M4(-1.577e-01, 1.250e-02, -8.648e-02, -4.689e-02, -6.336e-03, -3.293e-03, 1.110e-03, 2.097e-03, 2.277e-03, -1.022e-02, -1.019e-03, -3.308e-03, 1.170e-01, -1.107e-02, 1.322e-01, 2.385e-03) * s0_1_0;
	r0 += M4(-8.021e-02, -6.038e-01, 4.326e-01, -1.053e-01, 2.173e-01, -4.296e-02, 1.675e-01, 2.452e-02, -5.569e-01, -1.812e-01, -9.005e-02, 6.056e-02, -6.368e-02, 5.282e-01, -8.165e-01, 1.111e-01) * s0_1_1;
	r0 += M4(-1.173e-02, 1.275e-01, 4.014e-03, 2.144e-01, 1.665e-01, -1.548e-01, -7.352e-02, -7.022e-01, 3.799e-01, -6.418e-02, -2.039e-02, -2.607e-01, -7.982e-02, 1.027e-01, -2.271e-02, 6.812e-02) * s0_1_2;
	r0 += M4(2.949e-03, 2.134e-03, -8.520e-02, 8.288e-03, -6.587e-04, 1.055e-04, -1.581e-02, -6.011e-03, -1.238e-02, 9.397e-03, 3.083e-03, 2.538e-03, 1.998e-04, -1.396e-03, 6.518e-02, 4.520e-03) * s0_2_0;
	r0 += M4(-9.190e-03, 1.922e-02, -6.755e-02, -1.704e-01, 5.656e-03, 9.584e-03, 1.098e-02, -2.932e-02, -1.432e-02, -3.629e-02, 1.106e-01, 1.470e-01, -4.086e-03, -2.335e-02, 1.047e-01, 1.373e-01) * s0_2_1;
	r0 += M4(7.299e-03, 6.259e-03, -6.744e-03, 6.162e-03, 2.891e-03, -2.629e-03, 4.639e-02, 3.400e-02, 8.160e-03, 6.618e-02, 1.782e-01, 2.162e-01, -1.123e-02, -1.263e-02, -3.014e-02, 3.908e-02) * s0_2_2;
	// Constant offset. Unlike the earlier passes, the result is not clamped
	// here.
	r0 += V4(-2.727e-11, -1.608e-10, -1.877e-11, -2.711e-10);
	// opt: size of one output texel in normalized LUMA coordinates (half a
	// LUMA texel). fpos: centre of the top-left output texel of the 2x2 block.
	vec2 opt = 0.5 * LUMA_pt;
	vec2 fpos = (vec2(opos) + vec2(0.5)) * opt;
	// Write the 2x2 block: r0.x -> (0,0), r0.y -> (1,0), r0.z -> (0,1),
	// r0.w -> (1,1). Each value is added to LUMA sampled at that texel's
	// centre (presumably interpolated by the texture sampler — confirm the
	// filtering mpv applies to LUMA_tex).
	imageStore(out_image, opos + ivec2(0, 0), vec4(r0.x + LUMA_tex(fpos + vec2(0.0, 0.0) * opt).r, 0.0, 0.0, 1.0));
	imageStore(out_image, opos + ivec2(1, 0), vec4(r0.y + LUMA_tex(fpos + vec2(1.0, 0.0) * opt).r, 0.0, 0.0, 1.0));
	imageStore(out_image, opos + ivec2(0, 1), vec4(r0.z + LUMA_tex(fpos + vec2(0.0, 1.0) * opt).r, 0.0, 0.0, 1.0));
	imageStore(out_image, opos + ivec2(1, 1), vec4(r0.w + LUMA_tex(fpos + vec2(1.0, 1.0) * opt).r, 0.0, 0.0, 1.0));
}
